import pandas as pd
import re
import matplotlib.pyplot as plt
import os
import plotly.express as px
# Load the daily store-item sales data and index it by date.
# parse_dates makes the index a DatetimeIndex so time-series plots are
# ordered chronologically rather than lexicographically on date strings.
df = pd.read_csv('train.csv', parse_dates=['date'])
df.set_index('date', inplace=True)
df.head()

# Total sales per store (double brackets keep the result a DataFrame).
store_sales = df.groupby(by='store')[['sales']].sum()
store_sales

# Bar chart of total sales, one colour per store.
store = store_sales.index
fig = px.bar(store_sales, color=store)
fig.show()

# Distribution of daily sales for item 1 across all stores.
fig = px.histogram(df[df.item == 1][['sales']], labels=dict(value="Sales"))
fig.show()

# Daily sales of item 1 in store 4 over time.
fig = px.line(df[(df.item == 1) & (df.store == 4)][['sales']], y='sales')
fig.show()

# The series modelled for the rest of the analysis: item 1 in store 1.
df_1_1 = df[(df.item == 1) & (df.store == 1)][['sales']]
fig = px.line(df_1_1)
fig.show()
# Stationarity requirement: the mean, variance and covariance of the series are constant over time.
from statsmodels.tsa.seasonal import seasonal_decompose

# Additive decomposition with a yearly period (daily data -> 365).
result = seasonal_decompose(df_1_1, model='additive', period=365)
# result.plot() creates its own figure, so resize that figure directly;
# a preceding plt.figure(figsize=...) call only produced an empty figure.
fig = result.plot()
fig.set_size_inches(36, 24)
plt.show()

# Zoom in on the tail of the series.
df_1_1.iloc[1400:, ].plot()

# Use plotly for subsequent pandas .plot() calls.
pd.options.plotting.backend = "plotly"

import statsmodels.api as sm

# Autocorrelation plot (statsmodels draws with matplotlib regardless of
# the pandas plotting backend set above).
fig, ax = plt.subplots(figsize=(36, 24))
sm.graphics.tsa.plot_acf(df_1_1, ax=ax)
plt.show()
# The null hypothesis of the ADF test is that the time series can be represented
# by a unit root, i.e. that it is non-stationary (has some time-dependent
# structure). The alternative hypothesis (rejecting the null) is that the
# series is stationary.
#
# Null hypothesis (H0): if it fails to be rejected, the series has a unit root,
# meaning it is non-stationary. Alternative hypothesis (H1): the null is
# rejected; the series has no unit root, meaning it is stationary. The result
# is interpreted via the p-value against a threshold (such as 5% or 1%):
#   p-value > 0.05:  fail to reject H0 -- the data has a unit root and is non-stationary.
#   p-value <= 0.05: reject H0 -- the data has no unit root and is stationary.
from statsmodels.tsa.stattools import adfuller

# Augmented Dickey-Fuller test: H0 = the series has a unit root (non-stationary).
hypothesis_test = adfuller(df_1_1)
print('ADF Statistic: %f' % hypothesis_test[0])
print('p-value: %f' % hypothesis_test[1])
print('Critical Values:')
# Indentation restored: the print must execute once per critical value.
for key, value in hypothesis_test[4].items():
    print('\t%s: %.3f' % (key, value))

# Distribution of the raw series.
fig = px.histogram(df_1_1)
fig.show()
# p-value < 0.05, so we reject the null hypothesis: the time series is stationary.
# Preview the first difference (leading NaN shown as 0 for display only).
df_1_1.diff(periods=1).fillna(0).head()

# First-order difference of the series: the "I" (d) component of ARIMA, lag 1.
df_diff = df_1_1.diff(periods=1)
df_diff = df_diff.iloc[1:]   # drop the leading NaN row
df_diff.head()

fig = px.line(df_diff)
fig.show()
from statsmodels.tsa.arima.model import ARIMA
import itertools

# Candidate (p, d, q) orders for the grid search: every combination in 0..4.
p = d = q = range(0, 5)
pdq = list(itertools.product(p, d, q))

# Chronological train/test split: first 88% train, rest test.
# (An earlier duplicate split at 66% was dead code -- its X/size/predictions
# were immediately overwritten -- and has been removed.)
X = df_1_1.values
size = int(len(X) * 0.88)
train, test = X[0:size], X[size:len(X)]
history = [x for x in train]   # walk-forward history, extended during validation
predictions = list()

# Matching date ranges for plotting later.
train_date = df_1_1.index[0:size]
test_date = df_1_1.index[size:len(X)]
import warnings
warnings.filterwarnings("ignore")

# Grid-search ARIMA orders by AIC (lower is better).  Orders whose fit
# fails (e.g. non-invertible parameterisations) are skipped.
# NOTE: indentation of the loop body was restored; it had been flattened.
AIC = {}
for order in pdq:
    try:
        model_arima = ARIMA(train, order=order)
        model_fit = model_arima.fit()
        print(model_fit.aic, " ", order)
        AIC[model_fit.aic] = order
    except Exception:
        # Narrowed from a bare except: still best-effort, but no longer
        # swallows KeyboardInterrupt/SystemExit.
        continue

# Best order = the candidate with the smallest AIC (the original computed
# this as a bare, discarded expression; it is now captured and printed).
best_order = AIC[min(AIC.keys())]
print('Best order by AIC:', best_order)

# Fit the selected model (order chosen from the search above).
model_arima = ARIMA(train, order=(2, 1, 3))
model_fit = model_arima.fit()
model_fit.summary()

# Residual diagnostics: residuals should resemble zero-mean noise.
residuals = pd.DataFrame(model_fit.resid)
residuals.plot()
print(residuals.describe())
predictions = []
# Walk-forward validation: refit on all history, forecast one step ahead,
# then append the observed value before the next iteration.
# NOTE: indentation of the loop body was restored; it had been flattened.
for t in range(len(test)):
    model = ARIMA(history, order=(2, 1, 3))
    model_fit = model.fit()
    output = model_fit.forecast()
    yhat = output[0]          # one-step-ahead forecast
    predictions.append(yhat)
    obs = test[t]
    history.append(obs)
    print('predicted=%f, expected=%f' % (yhat, obs))
# evaluate forecasts
from sklearn.metrics import mean_squared_error

# RMSE computed without numpy: the original called np.sqrt but numpy was
# never imported, which raised a NameError at this line.
rmse = mean_squared_error(test, predictions) ** 0.5
print('Test RMSE: %.3f' % rmse)

# plot forecasts against actual outcomes
plt.figure(figsize=(24, 12))
plt.plot(test)
plt.plot(predictions, color='red')
plt.show()

from sklearn.metrics import mean_absolute_error
print('Mean Absolute Error:', mean_absolute_error(test.reshape(-1), predictions))
# Plot the tail of the training data, the held-out actuals and the
# one-step forecasts on a shared date axis.
plt.figure(figsize=(350, 200))   # very large canvas so the dense date axis stays legible
plt.plot(train_date[1600:], train[1600:], 'green')
plt.plot(test_date, test, 'blue')
plt.plot(test_date, predictions, 'red')
plt.xticks(rotation=90, fontsize=150)
plt.xlabel('Date', fontsize=250)
plt.ylabel('Sales', fontsize=250)
# Save once, after ticks and labels are set.  The original also saved an
# unlabelled intermediate copy to 'A.png' before formatting; that
# premature save was dropped as accidental.
plt.savefig('Arima.png')
#fig = px.line(train_date,train)
#fig.show()

# '%matplotlib inline' is IPython notebook magic and is a syntax error in
# a plain .py script, so it is commented out here.
# %matplotlib inline

# Forecasts aligned to the test dates for downstream comparison.
df_pred = pd.DataFrame({'Predictions': predictions}, index=test_date)
#df_1_1.merge(df_pred,'inner',on='index')